#include <sys/cdefs.h>
#ifdef __sw_slave__
#ifdef __sw_regcomm__
#include "simd_cpp.hpp"
__always_inline void reg_reduce_inplace_doublev4(double *arrs, int len) {
  vreal *arr = (vreal*)arrs;
  int i, j;
  vreal tmp;
  for (i = 1; i < 8; i += i) {
    if ((_ROW & i) == i) {
      for (j = 0; j < len; j++)
        asm("putc %0, %1"
            :
            : "r"(arr[j]), "r"(_ROW ^ i));
    }
    if ((_ROW & i) == 0) {
      for (j = 0; j < len; j++) {
        asm(
            "getc %0\n\t"
            "vaddd %0, %1, %1\n\t"
            : "=r"(tmp), "+r"(arr[j]));
        //arr[j] += tmp;
      }
    }
    qthread_sync();
  }
  qthread_syn();
  if (_ROW == 0) {
    for (i = 1; i < 8; i += i) {
      if ((_COL & i) == i) {
        for (j = 0; j < len; j++)
          asm("putr %0, %1"
              :
              : "r"(arr[j]), "r"(_COL ^ i));
      }
      if ((_COL & i) == 0) {
        for (j = 0; j < len; j++) {
          /* asm("getr %0\n" : "=r"(tmp)); */
          /* arr[j] += tmp; */
          asm(
              "getr %0\n\t"
              "vaddd %0, %1, %1\n\t"
              : "=r"(tmp), "+r"(arr[j]));
        }
      }
    }
    qthread_synr();
  }
}
#elif defined(__sw_rldst__)
#include "simd_cpp.hpp"
/*
 * Staging buffer for reg_reduce_inplace_doublev4: 64 vreal slots, indexed by
 * _MYID. Declared weak so that several translation units may each carry this
 * definition, and placed in the ".ldm" section.
 */
__attribute__((weak)) __attribute__((section(".ldm"))) vreal reg_reduce_buf[64];
INLINE void reg_reduce_inplace_doublev4(double *arr, int len) {
  long reg_reduce_buf_addr = (long)reg_reduce_buf;
  reg_reduce_buf_addr |= 0x200000000000L;
  vreal *reduce_buf = reg_reduce_buf_addr;
  int n = len * 4;
  qthread_syn();
  for (int i = 0; i < n; i += 8) {
    vreal tmp = v_ldd(arr + i);
    reduce_buf[_MYID] = tmp;
    qthread_syn();
    if (!_MYID) {
      double sum[8];
      vreal sumv8 = v_set1d(0);
      for (int j = 0; j < 64; j ++){
        sumv8 += reg_reduce_buf[j];
      }
      v_std(sum, sumv8);
      for (int j = 0; i + j < n && j < 8; j ++) {
        arr[i + j] = sum[j];
      }
    }
    qthread_syn();
  }
}
#endif
#endif
